Python入门-编写抓取网站图片的爬虫-正则表达式
//生命太短 我用Python!
//Python真是让一直用c++的村里孩子长知识了!
这个仅仅是一个测试，成功抓取了某网站 1000 多张图片。
下一步要做一个大新闻 大工程
# -*- coding: utf-8 -*-
"""Image crawler (Python 2): downloads .jpg/.jpeg images from a gallery site.

For each gallery page it reads the <title> as a folder name, collects all
image URLs matched by a handful of regexes, and saves every image under a
local directory named ".\<title>-[<count>p]" (Windows-style path).

NOTE(review): this is Python 2 code (urllib2, unicode(), urlretrieve); it
does not run on Python 3.  The '---' strings below are placeholders where
the original site URL was redacted by the author.
"""

import os
import re
import urllib
import urllib2

# Total number of images downloaded across the whole run.
CNT = 0

# Browser-like headers: some hosts refuse requests carrying the default
# urllib2 User-Agent.  REQ_HEADER is the fuller set kept for reference;
# REQ_HEADER_2 is what getHtml() actually sends.
REQ_HEADER = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 '
                  '(KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
    'Accept': 'text/html;q=0.9,*/*;q=0.8',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
    'Accept-Encoding': 'gzip',
    'Connection': 'close',
    'Referer': None,  # if the site still blocks requests, set this to its host
}
REQ_HEADER_2 = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; rv:28.0) Gecko/20100101 Firefox/28.0',
}
REQ_TIMEOUT = 5  # seconds


def getHtml(url):
    """Fetch *url* and return the raw HTML, or False if the request fails.

    Returning False (instead of letting urllib2 exceptions escape) lets the
    crawl loops skip a bad page and keep going -- the `html == False` checks
    below were previously dead code because nothing ever returned False.
    """
    try:
        req = urllib2.Request(url, None, REQ_HEADER_2)
        resp = urllib2.urlopen(req, None, REQ_TIMEOUT)
        try:
            return resp.read()
        finally:
            resp.close()  # original leaked the connection
    except Exception as e:
        print('Http Error: %s (%s)' % (e, url))
        return False


def getAllUrl(html):
    """Return every href captured from <a href="..." target= anchors."""
    return re.findall(r'<a href="(.+)" target=', html)


def getNext(html):
    """Return the id of the next page (from .../pai/<id>.html links).

    Raises IndexError when no such link exists.  (Currently unused.)
    """
    return re.findall(r"<a href='.+pai/(.+).html'", html)[0]


def getName(html):
    """Return the page <title>, used as the download directory name.

    Falls back to 'untitled' when the page has no title so a malformed
    page no longer aborts the whole crawl with IndexError.
    """
    names = re.findall(r'<title>(.+)</title>', html)
    return names[0] if names else 'untitled'


def getImg(name, html):
    """Download every image found in *html* into a directory named after *name*.

    Matches .jpg/.jpeg URLs in both single- and double-quoted <img> tags,
    updates the global CNT counter, and returns a human-readable status line.
    Skips the whole gallery when its directory already exists.
    """
    global CNT

    # The four patterns cover the quoting/attribute variants seen on the site.
    # (The last one originally said '.jepg' -- a typo that could never match
    # a real .jpeg URL; fixed to '.jpeg'.)
    patterns = [
        r'<img src="(.{0,80}\.jpg)" border="0"',
        r'src="(.{0,80}\.jpeg)" border',
        r"<img src='(.{0,80}\.jpg)'",
        r"<img src='(.{0,80}\.jpeg)'",
    ]
    imglist = []
    for pat in patterns:
        imglist.extend(re.findall(pat, html))

    # Windows-style relative path: .\<name>-[<count>p]
    local = '.\%s-[%sp]' % (name, len(imglist))
    if os.path.exists(unicode(local, 'utf-8')):
        return unicode(local, 'utf-8') + u' already exists'

    os.mkdir(unicode(local, 'utf-8'))

    x = 0
    for imgurl in imglist:
        print(imgurl)
        try:
            # NOTE: every file is saved with a .jpg extension, even .jpeg
            # sources -- preserved from the original behaviour.
            urllib.urlretrieve(imgurl, unicode(local + '\%s.jpg' % x, 'utf-8'))
        except Exception as e:
            # A single dead image link should not abort the gallery.
            print('download failed: %s (%s)' % (e, imgurl))
            continue
        x += 1
        CNT += 1

    return unicode('%s: get %s photo(s)' % (name, x), 'utf-8')


def getAll(num):
    """Crawl *num* gallery pages by counting page ids down from 164680."""
    global CNT
    nxt = 164680
    while num > 0:
        url = '---%s.html' % nxt  # '---' = redacted site prefix
        print(nxt)
        html = getHtml(url)
        nxt -= 1
        num -= 1
        if html == False:
            print('Error')
            continue
        print(getImg(getName(html), html))
    return 'done! %s photos!' % str(CNT)


def getAll_update(index):
    """Crawl every gallery linked from the listing page *index*.

    Returns a summary counting only the photos fetched by this call.
    """
    global CNT
    before = CNT
    page = getHtml(index)
    if page == False:
        return 'index fetch failed: %s' % index
    for url in getAllUrl(page):
        html = getHtml('---' + url)  # '---' = redacted site prefix
        if html == False:
            continue
        print(getImg(getName(html), html))
    return 'done! %s photos!' % str(CNT - before)


if __name__ == '__main__':
    # Guarded so importing this module no longer starts the crawl.
    # NOTE(review): '---' is the redacted listing-URL template; it must
    # contain a '%s'/'%d' slot for the page number, otherwise the '%'
    # formatting below raises TypeError.
    x = 3
    while x < 50:
        print(getAll_update('---' % x))
        x += 1
header 伪装成浏览器
正则表达式 http://www.cnblogs.com/huxi/archive/2010/07/04/1771073.html //我也是刚刚学
基本都是一路百度写出来的